{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from numpy import linspace, loadtxt, ones, convolve\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import collections\n",
    "from random import randint\n",
    "from matplotlib import style\n",
    "style.use('fivethirtyeight')\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from itertools import zip_longest, count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !mkdir -p dataset\n",
    "# !wget -c -b http://www-personal.umich.edu/~mejn/cp/data/sunspots.txt -P dataset\n",
    "data = loadtxt(\"dataset/sunspots.txt\", float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_as_frame = pd.DataFrame(data, columns=['Months', 'SunSpots'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Months</th>\n",
       "      <th>SunSpots</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>58.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>62.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2.0</td>\n",
       "      <td>70.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.0</td>\n",
       "      <td>55.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4.0</td>\n",
       "      <td>85.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Months  SunSpots\n",
       "0     0.0      58.0\n",
       "1     1.0      62.6\n",
       "2     2.0      70.0\n",
       "3     3.0      55.7\n",
       "4     4.0      85.0"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_as_frame.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def moving_average(data, window_size):\n",
    "    # rectangular\n",
    "    window = np.ones(int(window_size))/float(window_size)\n",
    "    return np.convolve(data, window, 'same')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "def explain_anomalies(y, window_size, sigma=1.0):\n",
    "    avg = moving_average(y, window_size).tolist()\n",
    "    residual = y - avg\n",
    "    # Calculate the variation in the distribution of the residual\n",
    "    std = np.std(residual)\n",
    "    return {'standard_deviation': round(std, 3),\n",
    "        'anomalies_dict': collections.OrderedDict([(index, y_i) for\n",
    "                        index, y_i, avg_i in zip_longest(count(), y, avg)\n",
    "                if (y_i > avg_i + (sigma*std)) | (y_i < avg_i - (sigma*std))])}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "def explain_anomalies_rolling_std(y, window_size, sigma=1.0):\n",
    "    avg = moving_average(y, window_size)\n",
    "    avg_list = avg.tolist()\n",
    "    residual = y - avg\n",
    "    # Calculate the variation in the distribution of the residual\n",
    "    testing_std = pd.rolling_std(residual, window_size)\n",
    "    testing_std_as_df = pd.DataFrame(testing_std)\n",
    "    rolling_std = testing_std_as_df.replace(np.nan,\n",
    "            testing_std_as_df.ix[window_size - 1]).round(3).iloc[:,0].tolist()\n",
    "    std = np.std(residual)\n",
    "    return {'stationary standard_deviation': round(std, 3),\n",
    "        'anomalies_dict': collections.OrderedDict([(index, y_i)\n",
    "                for index, y_i, avg_i, rs_i in zip_longest(count(), y, avg_list, r)\n",
    "        if (y_i > avg_i + (sigma * rs_i)) | (y_i < avg_i - (sigma * rs_i))])}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "ename": "IndentationError",
     "evalue": "expected an indented block (<ipython-input-22-174260366424>, line 13)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;36m  File \u001b[0;32m\"<ipython-input-22-174260366424>\"\u001b[0;36m, line \u001b[0;32m13\u001b[0m\n\u001b[0;31m    events = explain_anomalies_rolling_std(y, window_size=window_size, sigma=sigma_value)\u001b[0m\n\u001b[0m         ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m expected an indented block\n"
     ]
    }
   ],
   "source": [
    "def plot_results(x, y, window_size, sigma_value=1,\n",
    "text_xlabel=\"X Axis\", text_ylabel=\"Y Axis\", applying_rolling_std=False):\n",
    "    plt.figure(figsize=(15, 8))\n",
    "    plt.plot(x, y, \"k.\")\n",
    "    y_av = moving_average(y, window_size)\n",
    "    plt.plot(x, y_av, color='green')\n",
    "    plt.xlim(0, 1000)\n",
    "    plt.xlabel(text_xlabel)\n",
    "    plt.ylabel(text_ylabel)\n",
    "    # Query for the anomalies and plot the same\n",
    "    events = {}\n",
    "    if applying_rolling_std:\n",
    "        events = explain_anomalies_rolling_std(y, window_size=window_size, sigma=sigma_value)\n",
    "    else:\n",
    "        events = explain_anomalies(y, window_size=window_size, sigma=sigma_value)\n",
    "        \n",
    "    x_anomaly = np.fromiter(events['anomalies_dict'].iterkeys(), dtype=int, count=len(events['anomalies_d\n",
    "    y_anomaly = np.fromiter(events['anomalies_dict'].itervalues(), dtype=float,\n",
    "    count=len(events['anomalies_dict']))\n",
    "    plt.plot(x_anomaly, y_anomaly, \"r*\", markersize=12)\n",
    "    # add grid and lines and enable the plot\n",
    "    plt.grid(True)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "x = data_as_frame['Months']\n",
    "Y = data_as_frame['SunSpots']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
